# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
library(summarytools)
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
library(ggcorrplot)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(viridis)
## Loading required package: viridisLite
library(readr)
library(treemap)
# Load the datasets
estimated_numbers <- read_csv("estimated_numbers.csv")
## Rows: 856 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country, No. of cases, No. of deaths, WHO Region
## dbl (7): Year, No. of cases_median, No. of cases_min, No. of cases_max, No. ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
incidence_per_1000_pop_at_risk <- read_csv("incidence_per_1000_pop_at_risk.csv")
## Rows: 2033 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, WHO Region
## dbl (2): Year, Incidence per 1000 population at risk
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
reported_numbers <- read_csv("reported_numbers.csv")
## Rows: 1944 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, WHO Region
## dbl (3): Year, No. of cases, No. of deaths
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load geospatial data (world map)
world <- st_read("ne_10m_admin_0_countries.shp")
## Reading layer `ne_10m_admin_0_countries' from data source
## `C:\Users\sputhuma\Documents\HJF\ne_10m_admin_0_countries.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 258 features and 168 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## Geodetic CRS: WGS 84
reported_numbers <- reported_numbers %>%
rename(NAME = `NAME`, `Reported Cases` = `No. of cases`, `Reported Deaths` = `No. of deaths`)
# Merge the malaria data with the geospatial data
malaria_map_data <- left_join(world, reported_numbers %>% filter(Year == max(Year)), by = "NAME")
# Check if 'Reported Cases' column exists
if(!"Reported Cases" %in% names(malaria_map_data)) {
stop("Column 'Reported Cases' does not exist in the merged data.")
}
# Handle any NAs in the merged data
malaria_map_data <- malaria_map_data %>% drop_na(`Reported Cases`)
# Fix invalid geometries
malaria_map_data <- st_make_valid(malaria_map_data)
# Visualize the map using tmap
tmap_options(check.and.fix = TRUE)
tm_shape(malaria_map_data) +
tm_polygons("Reported Cases", style = "quantile", palette = "viridis", title = "Reported Malaria Cases") +
tm_layout(title = "Global Distribution of Reported Malaria Cases",
title.position = c("left", "top"),
legend.outside = TRUE)

# ggplot2 for geospatial visualization
ggplot(data = malaria_map_data) +
geom_sf(aes(fill = `Reported Cases`)) +
scale_fill_viridis_c() +
labs(title = "Global Distribution of Reported Malaria Cases",
fill = "Reported Cases") +
theme_minimal()

# Interactive map with tmap
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(malaria_map_data) +
tm_polygons("Reported Cases", style = "quantile", palette = "viridis", title = "Reported Malaria Cases",
id = "NAME", popup.vars = c("Country Name" = "NAME", "Reported Cases" = "Reported Cases")) +
tm_layout(title = "Global Distribution of Reported Malaria Cases")
# Insight 1: Top regions affected
top_regions <- malaria_map_data %>%
st_drop_geometry() %>%
arrange(desc(`Reported Cases`)) %>%
head(10)
# Rename columns in incidence_per_1000_pop_at_risk to match those in world
incidence_per_1000_pop_at_risk <- incidence_per_1000_pop_at_risk %>%
rename(NAME = `NAME`, `Incidence per 1000 population` = `Incidence per 1000 population at risk`)
# Merge with the world data
incidence_map_data <- left_join(world, incidence_per_1000_pop_at_risk %>% filter(Year == max(Year)), by = "NAME")
# Handle any NAs in the merged data
incidence_map_data <- incidence_map_data %>% drop_na(`Incidence per 1000 population`)
# Visualize incidence data
tm_shape(incidence_map_data) +
tm_polygons("Incidence per 1000 population", style = "quantile", palette = "magma", title = "Malaria Incidence per 1000 Population at Risk",
id = "NAME", popup.vars = c("Country Name" = "NAME", "Incidence per 1000 Population" = "Incidence per 1000 population")) +
tm_layout(title = "Global Malaria Incidence",
title.position = c("left", "top"),
legend.outside = TRUE)
## Warning: The shape incidence_map_data is invalid. See sf::st_is_valid
# Insight 3: Comparison of cases and deaths
case_death_comparison <- reported_numbers %>%
filter(Year == max(Year)) %>%
select(NAME, `Reported Cases`, `Reported Deaths`) %>%
arrange(desc(`Reported Cases`))
# Filter top 50 countries by reported cases and deaths
top_50_cases <- case_death_comparison %>%
arrange(desc(`Reported Cases`)) %>%
head(50)
top_50_deaths <- case_death_comparison %>%
arrange(desc(`Reported Deaths`)) %>%
head(50)
# Create a data frame for plotting
cases_data <- top_50_cases %>%
mutate(text = paste("Country: ", NAME, "<br>Reported Cases: ", `Reported Cases`))
deaths_data <- top_50_deaths %>%
mutate(text = paste("Country: ", NAME, "<br>Reported Deaths: ", `Reported Deaths`))
# Interactive treemap for Reported Cases with unique colors for each country
plot_cases <- plot_ly(
data = cases_data,
type = "treemap",
ids = ~NAME,
labels = ~NAME,
parents = "",
values = ~`Reported Cases`,
textinfo = "label+value+percent entry",
hoverinfo = "text",
text = ~text,
marker = list(colors = ~NAME, colorscale = "Set3", showscale = FALSE)
) %>%
layout(title = "Top 50 Countries with Reported Malaria Cases (2000-2018)")
# Interactive treemap for Reported Deaths
plot_deaths <- plot_ly(
data = deaths_data,
type = "treemap",
ids = ~NAME,
labels = ~NAME,
parents = "",
values = ~`Reported Deaths`,
textinfo = "label+value+percent entry",
hoverinfo = "text",
text = ~text,
marker = list(colors = ~`Reported Deaths`, colorscale = "Reds", showscale = TRUE)
) %>%
layout(title = "Top 50 Countries with Reported Malaria Deaths (2000-2018)")
# Print the interactive plots
plot_cases
plot_deaths